We, Ruturajsinh Solanki, Kunal Vaghela, and Satya Akhil Govvala, hereby state that we have not communicated with or gained information in any way from any person or resource that would violate the College’s academic integrity policies, and that all work presented is our own. In addition, we also agree not to share our work in any way, before or after submission, that would violate the College’s academic integrity policies.
R version 4.2.2 (2022-10-31 ucrt) RStudio 2022.12.0+353 “Elsbeth Geranium” Release (7d165dcfc1b6d300eb247738db2c7076234f6ef0, 2022-12-03) for Windows, Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) RStudio/2022.12.0+353 Chrome/102.0.5005.167 Electron/19.1.3 Safari/537.36
library("tidyverse")
library("ggplot2")
library("plotly")
library("gridExtra")
library("patchwork")
library("viridis")
library("wordcloud")
This data is about how booking hotel rooms have been greatly transformed by online reservation channels, and this has also impacted how customers behave. A considerable amount of hotel bookings are cancelled, which is not beneficial for hotel.
The dataset contains different attributes of customers’ reservation details. It contains 36275 observations of 19 variables. All of them are non-NULL.
The detailed data dictionary is given below.
Hotel_Reservations <- read.csv("C:\\Users\\solan\\OneDrive\\Desktop\\Work\\St.Clair_sem_1\\DAB501_basic_stats\\DAB501\\Project\\Hotel_Reservations.csv", header=TRUE, stringsAsFactors=FALSE)
head(Hotel_Reservations)
## Booking_ID no_of_adults no_of_children no_of_weekend_nights no_of_week_nights
## 1 INN00001 2 0 1 2
## 2 INN00002 2 0 2 3
## 3 INN00003 1 0 2 1
## 4 INN00004 2 0 0 2
## 5 INN00005 2 0 1 1
## 6 INN00006 2 0 0 2
## type_of_meal_plan required_car_parking_space room_type_reserved lead_time
## 1 Meal Plan 1 0 Room_Type 1 224
## 2 Not Selected 0 Room_Type 1 5
## 3 Meal Plan 1 0 Room_Type 1 1
## 4 Meal Plan 1 0 Room_Type 1 211
## 5 Not Selected 0 Room_Type 1 48
## 6 Meal Plan 2 0 Room_Type 1 346
## arrival_year arrival_month arrival_date market_segment_type repeated_guest
## 1 2017 10 2 Offline 0
## 2 2018 11 6 Online 0
## 3 2018 2 28 Online 0
## 4 2018 5 20 Online 0
## 5 2018 4 11 Online 0
## 6 2018 9 13 Online 0
## no_of_previous_cancellations no_of_previous_bookings_not_canceled
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## avg_price_per_room no_of_special_requests booking_status
## 1 65.00 0 Not_Canceled
## 2 106.68 1 Not_Canceled
## 3 60.00 0 Canceled
## 4 100.00 0 Canceled
## 5 94.50 0 Canceled
## 6 115.00 1 Canceled
str(Hotel_Reservations)
## 'data.frame': 36275 obs. of 19 variables:
## $ Booking_ID : chr "INN00001" "INN00002" "INN00003" "INN00004" ...
## $ no_of_adults : int 2 2 1 2 2 2 2 2 3 2 ...
## $ no_of_children : int 0 0 0 0 0 0 0 0 0 0 ...
## $ no_of_weekend_nights : int 1 2 2 0 1 0 1 1 0 0 ...
## $ no_of_week_nights : int 2 3 1 2 1 2 3 3 4 5 ...
## $ type_of_meal_plan : chr "Meal Plan 1" "Not Selected" "Meal Plan 1" "Meal Plan 1" ...
## $ required_car_parking_space : int 0 0 0 0 0 0 0 0 0 0 ...
## $ room_type_reserved : chr "Room_Type 1" "Room_Type 1" "Room_Type 1" "Room_Type 1" ...
## $ lead_time : int 224 5 1 211 48 346 34 83 121 44 ...
## $ arrival_year : int 2017 2018 2018 2018 2018 2018 2017 2018 2018 2018 ...
## $ arrival_month : int 10 11 2 5 4 9 10 12 7 10 ...
## $ arrival_date : int 2 6 28 20 11 13 15 26 6 18 ...
## $ market_segment_type : chr "Offline" "Online" "Online" "Online" ...
## $ repeated_guest : int 0 0 0 0 0 0 0 0 0 0 ...
## $ no_of_previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ no_of_previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
## $ avg_price_per_room : num 65 106.7 60 100 94.5 ...
## $ no_of_special_requests : int 0 1 0 0 0 1 1 1 1 3 ...
## $ booking_status : chr "Not_Canceled" "Not_Canceled" "Canceled" "Canceled" ...
sum(is.null(Hotel_Reservations))
## [1] 0
sum(is.na(Hotel_Reservations))
## [1] 0
# Create the histogram plot
plot1 <- ggplot(Hotel_Reservations, aes(x = avg_price_per_room)) +
geom_histogram(binwidth = 1, color = "darkblue", fill = "lightblue") +
labs(x = "Average Price per Room", y = "Frequency", title = "Distribution of Average Price per Room") +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
# Create the boxplot
plot2 <- ggplot(Hotel_Reservations, aes(y = avg_price_per_room)) +
geom_boxplot(color = "darkblue", fill = "lightblue") +
labs(y = "Average Price per Room", title = "Distribution of Average Price per Room (Boxplot)")+
scale_y_continuous(breaks = scales::breaks_width(50)) +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
# Combine the two plots
plot1 + plot2
filtered_avg_price <- Hotel_Reservations %>%
filter(avg_price_per_room > 25, avg_price_per_room <= 175)
head(filtered_avg_price)
## Booking_ID no_of_adults no_of_children no_of_weekend_nights no_of_week_nights
## 1 INN00001 2 0 1 2
## 2 INN00002 2 0 2 3
## 3 INN00003 1 0 2 1
## 4 INN00004 2 0 0 2
## 5 INN00005 2 0 1 1
## 6 INN00006 2 0 0 2
## type_of_meal_plan required_car_parking_space room_type_reserved lead_time
## 1 Meal Plan 1 0 Room_Type 1 224
## 2 Not Selected 0 Room_Type 1 5
## 3 Meal Plan 1 0 Room_Type 1 1
## 4 Meal Plan 1 0 Room_Type 1 211
## 5 Not Selected 0 Room_Type 1 48
## 6 Meal Plan 2 0 Room_Type 1 346
## arrival_year arrival_month arrival_date market_segment_type repeated_guest
## 1 2017 10 2 Offline 0
## 2 2018 11 6 Online 0
## 3 2018 2 28 Online 0
## 4 2018 5 20 Online 0
## 5 2018 4 11 Online 0
## 6 2018 9 13 Online 0
## no_of_previous_cancellations no_of_previous_bookings_not_canceled
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## avg_price_per_room no_of_special_requests booking_status
## 1 65.00 0 Not_Canceled
## 2 106.68 1 Not_Canceled
## 3 60.00 0 Canceled
## 4 100.00 0 Canceled
## 5 94.50 0 Canceled
## 6 115.00 1 Canceled
# Create the base histogram plot
base_plot <- ggplot(Hotel_Reservations, aes(x = avg_price_per_room, fill = ..count..)) +
geom_density(aes(y = ..count..), alpha = 0.1, fill = "blue", color = "black") +
labs(x = "Average Price per Room", y = "Count") +
theme_minimal() +
theme(legend.position = "right")
# Create the log scale plot
log_plot <- base_plot +
scale_x_log10() +
labs(title = "Logarithmic Scale")
# Create the square root scale plot
sqrt_plot <- base_plot +
scale_x_sqrt() +
labs(title = "Square Root Scale")
# Arrange the plots in a grid
grid.arrange(base_plot, log_plot, sqrt_plot, ncol = 1)
# Create the base histogram plot
base_plot <- ggplot(filtered_avg_price, aes(x = avg_price_per_room, fill = ..count..)) +
geom_density(aes(y = ..count..), alpha = 0.1, fill = "darkblue", color = "black") +
labs(x = "Average Price per Room", y = "Count") +
theme_minimal() +
theme(legend.position = "right")
# Create the log scale plot
log_plot <- base_plot +
scale_x_log10() +
labs(title = "Logarithmic Scale")
# Create the square root scale plot
sqrt_plot <- base_plot +
scale_x_sqrt() +
labs(title = "Square Root Scale")
# Arrange the plots in a grid
grid.arrange(base_plot, log_plot, sqrt_plot, ncol = 1)
# Create the histogram plot
plot1 <- ggplot(filtered_avg_price, aes(x = avg_price_per_room)) +
geom_histogram(binwidth = 1, color = "darkblue", fill = "lightblue") +
labs(x = "Average Price per Room", y = "Frequency", title = "Distribution of Average Price per Room") +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
# Create the boxplot
plot2 <- ggplot(filtered_avg_price, aes(y = avg_price_per_room)) +
geom_boxplot(color = "darkblue", fill = "lightblue") +
labs(y = "Average Price per Room", title = "Distribution of Average Price per Room (Boxplot)")+
scale_y_continuous(breaks = scales::breaks_width(50)) +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
# Combine the two plots
plot1 + plot2
mean(filtered_avg_price$avg_price_per_room)
## [1] 101.6938
median(filtered_avg_price$avg_price_per_room)
## [1] 99
mode(filtered_avg_price$avg_price_per_room)
## [1] "numeric"
table_mode_price <- table(filtered_avg_price$avg_price_per_room)
mode_price <- names(table_mode_price)[which.max(table_mode_price)]
mode_price
## [1] "65"
mad(filtered_avg_price$avg_price_per_room)
## [1] 28.1694
sd(filtered_avg_price$avg_price_per_room)
## [1] 26.86517
IQR(filtered_avg_price$avg_price_per_room)
## [1] 38.25
ggplot(filtered_avg_price, aes(x = room_type_reserved, fill = room_type_reserved)) +
geom_bar(alpha = 0.8) +
geom_text(stat = "count", aes(label = ..count..), vjust = -0.5, angle = 270) +
labs(title = "Distribution of Counts",
x = "Room Type Reserved",
y = "Reservation Count") +
scale_fill_discrete(name = "Room Type Reserved", labels = c("Room Type 1", "Room Type 2", "Room Type 3", "Room Type 4", "Room Type 5", "Room Type 6", "Room Type 7"))+
scale_color_discrete(name = "Room Type Reserved", labels = c("Room Type 1", "Room Type 2", "Room Type 3", "Room Type 4", "Room Type 5", "Room Type 6", "Room Type 7"))+
scale_x_discrete(labels=c("Room_Type 1" = "Room Type 1", "Room_Type 2" = "Room Type 2", "Room_Type 3" = "Room Type 3", "Room_Type 4" = "Room Type 4", "Room_Type 5" = "Room Type 5", "Room_Type 6" = "Room Type 6", "Room_Type 7" = "Room Type 7" )) +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10)) +
coord_flip()
# Create a table of counts by room type
room_counts <- table(filtered_avg_price$room_type_reserved)
# Calculate the proportion of each room type
room_props <- prop.table(room_counts)
# Create a data frame of the proportions
room_props_df <- data.frame(
room_type = names(room_props),
proportion = room_props
)
# Sort the data frame by proportion in increasing order
room_props_df <- room_props_df[order(room_props_df$proportion.Freq),]
# Create a bar chart of room proportions
ggplot(room_props_df, aes(x = proportion.Var1, y = proportion.Freq, fill = room_type)) +
geom_bar(stat = "identity", color = "black", width = 0.7) +
labs(title = "Proportion of Room Types",
x = "Room Types",
y = "overall Booking Proportion") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1))+
scale_fill_discrete(name = "Room Type Reserved", labels = c("Room Type 1", "Room Type 2", "Room Type 3", "Room Type 4", "Room Type 5", "Room Type 6", "Room Type 7"))+
scale_color_discrete(name = "Room Type Reserved", labels = c("Room Type 1", "Room Type 2", "Room Type 3", "Room Type 4", "Room Type 5", "Room Type 6", "Room Type 7"))+
scale_x_discrete(labels=c("Room_Type 1" = "Room Type 1", "Room_Type 2" = "Room Type 2", "Room_Type 3" = "Room Type 3", "Room_Type 4" = "Room Type 4", "Room_Type 5" = "Room Type 5", "Room_Type 6" = "Room Type 6", "Room_Type 7" = "Room Type 7" ))+
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
legend.position = c(0.95, 0.95),
legend.justification = c("right", "top"),
legend.title.align = 0.5,
legend.text.align = 0.5,
legend.margin = margin(5, 5, 5, 5))
# Set theme options
theme_set(theme_bw(base_size = 18))
# Create plot
ggplot(filtered_avg_price, aes(x = avg_price_per_room, y = lead_time)) +
geom_point(color = "grey", size = 3, alpha = 0.5) +
geom_density_2d(color = "black", alpha = 1) +
labs(x = "Average Price per Room", y = "Lead Time (days)", title = "Scatter Plot of Average Price vs. Lead Time") +
theme(plot.title = element_text(hjust = 0.5, size = 24, face = "bold", margin = margin(b = 10)),
axis.title = element_text(size = 18, margin = margin(t = 10)),
axis.text = element_text(size = 14)) +
scale_x_continuous(breaks = seq(0, 500, by = 50)) +
scale_y_continuous(breaks = seq(0, 500, by = 50)) +
theme(panel.grid.major = element_line(color = "lightgray", linetype = "dotted"),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.line = element_line(color = "black", linewidth = 0.5),
legend.position = "none")
cor(filtered_avg_price$avg_price_per_room, filtered_avg_price$lead_time)
## [1] -0.07147322
# Filtering the most profitable rooms using average price per room
top_room <- filtered_avg_price %>%
group_by(room_type_reserved) %>%
summarise(avg_price_per_room = sum(avg_price_per_room)) %>%
mutate(percentage_price = avg_price_per_room/sum(avg_price_per_room)*100) %>%
arrange(desc(percentage_price))
# Create polar bar chart with values added
ggplot(top_room, aes(x = room_type_reserved, y = percentage_price, fill = room_type_reserved)) +
geom_bar(stat = "identity", width = 0.5) +
geom_text(aes(label = paste0(round(percentage_price,2),"%")),
position = position_stack(vjust = 0.9),
color = "black", size = 5) +
scale_fill_discrete(name = "Room Type Reserved", labels = c("Room Type 1", "Room Type 2", "Room Type 3", "Room Type 4", "Room Type 5", "Room Type 6", "Room Type 7"))+
scale_color_discrete(name = "Room Type Reserved", labels = c("Room Type 1", "Room Type 2", "Room Type 3", "Room Type 4", "Room Type 5", "Room Type 6", "Room Type 7"))+
scale_x_discrete(labels=c("Room_Type 1" = "Room Type 1", "Room_Type 2" = "Room Type 2", "Room_Type 3" = "Room Type 3", "Room_Type 4" = "Room Type 4", "Room_Type 5" = "Room Type 5", "Room_Type 6" = "Room Type 6", "Room_Type 7" = "Room Type 7" ))+
scale_y_reverse() +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
legend.position = c(0.95, 0.5),
legend.justification = c("right", "top"),
legend.title.align = 0.5,
legend.text.align = 0.5,
legend.margin = margin(5, 5, 5, 5)) +
labs(title = "Top Room Types by Percentage of Average Price Per Room",
x = "Room Type", y = "Percentage of Average Price Per Room")
# Create the histogram plot
plot1 <- ggplot(Hotel_Reservations, aes(x = lead_time)) +
geom_histogram(binwidth = 1, color = "black", fill = "pink") +
labs(x = "Lead Time", y = "Frequency", title = "Distribution of Lead Time") +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
# Create the boxplot
plot2 <- ggplot(Hotel_Reservations, aes(x = "", y = lead_time)) +
geom_violin(color = "black", fill = "pink") +
labs(y = "Lead Time", title = "Distribution of Lead Time (Violin Plot)")+
scale_y_continuous(breaks = scales::breaks_width(50)) +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
# Combine the two plots
plot1 + plot2
filtered_lead_time <- Hotel_Reservations %>%
filter(lead_time >= 0 , lead_time <= 180)
head(filtered_lead_time)
## Booking_ID no_of_adults no_of_children no_of_weekend_nights no_of_week_nights
## 1 INN00002 2 0 2 3
## 2 INN00003 1 0 2 1
## 3 INN00005 2 0 1 1
## 4 INN00007 2 0 1 3
## 5 INN00008 2 0 1 3
## 6 INN00009 3 0 0 4
## type_of_meal_plan required_car_parking_space room_type_reserved lead_time
## 1 Not Selected 0 Room_Type 1 5
## 2 Meal Plan 1 0 Room_Type 1 1
## 3 Not Selected 0 Room_Type 1 48
## 4 Meal Plan 1 0 Room_Type 1 34
## 5 Meal Plan 1 0 Room_Type 4 83
## 6 Meal Plan 1 0 Room_Type 1 121
## arrival_year arrival_month arrival_date market_segment_type repeated_guest
## 1 2018 11 6 Online 0
## 2 2018 2 28 Online 0
## 3 2018 4 11 Online 0
## 4 2017 10 15 Online 0
## 5 2018 12 26 Online 0
## 6 2018 7 6 Offline 0
## no_of_previous_cancellations no_of_previous_bookings_not_canceled
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## avg_price_per_room no_of_special_requests booking_status
## 1 106.68 1 Not_Canceled
## 2 60.00 0 Canceled
## 3 94.50 0 Canceled
## 4 107.55 1 Not_Canceled
## 5 105.61 1 Not_Canceled
## 6 96.90 1 Not_Canceled
# Create the base histogram plot
base_plot <- ggplot(Hotel_Reservations, aes(x = lead_time, fill = ..count..)) +
geom_density(aes(y = ..count..), alpha = 0.3, fill = "red", color = "black") +
geom_histogram(fill = "red", color = "black", alpha = 0.1) +
labs(x = "Lead Time", y = "Count") +
theme_minimal() +
theme(legend.position = "right")
# Create the log scale plot
log_plot <- base_plot +
scale_x_log10() +
labs(title = "Logarithmic Scale")
# Create the square root scale plot
sqrt_plot <- base_plot +
scale_x_sqrt() +
labs(title = "Square Root Scale")
# Arrange the plots in a grid
grid.arrange(base_plot, log_plot, sqrt_plot, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Create the base histogram plot
base_plot <- ggplot(filtered_lead_time, aes(x = lead_time, fill = ..count..)) +
geom_density(aes(y = ..count..), alpha = 0.3, fill = "red", color = "black")+
geom_histogram(fill = "red", color = "black", alpha = 0.1) +
labs(x = "Lead Time", y = "Count") +
theme_minimal() +
theme(legend.position = "right")
# Create the log scale plot
log_plot <- base_plot +
scale_x_log10() +
labs(title = "Logarithmic Scale")
# Create the square root scale plot
sqrt_plot <- base_plot +
scale_x_sqrt() +
labs(title = "Square Root Scale")
# Arrange the plots in a grid
grid.arrange(base_plot, log_plot, sqrt_plot, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
filtered_lead_time$sqrt_lead_time <- sqrt(filtered_lead_time$lead_time)
# Save the transformed data to a new file
write.csv(filtered_lead_time, "filtered_lead_time_sqrt.csv", row.names = FALSE)
filtered_lead_time_sqrt <- read.csv("C:\\Users\\solan\\OneDrive\\Desktop\\Work\\St.Clair_sem_1\\DAB501_basic_stats\\DAB501\\Project\\filtered_lead_time_sqrt.csv")
# Create the histogram plot
plot1 <- ggplot(filtered_lead_time_sqrt, aes(x = lead_time)) +
geom_histogram(binwidth = 1, color = "black", fill = "pink") +
labs(x = "Lead Time", y = "Frequency", title = "Distribution of Lead Time") +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
# Create the boxplot
plot2 <- ggplot(filtered_lead_time_sqrt, aes(y = lead_time)) +
geom_boxplot(color = "black", fill = "pink") +
labs(y = "Lead Time", title = "Distribution of Lead Time (Boxplot)")+
scale_y_continuous(breaks = scales::breaks_width(50)) +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
# Combine the two plots
plot1 + plot2
mean(filtered_lead_time_sqrt$lead_time)
## [1] 56.6123
median(filtered_lead_time_sqrt$lead_time)
## [1] 43
mode(filtered_lead_time_sqrt$lead_time)
## [1] "numeric"
table_mode_price <- table(filtered_lead_time_sqrt$lead_time)
mode_price <- names(table_mode_price)[which.max(table_mode_price)]
mode_price
## [1] "0"
mad(filtered_lead_time_sqrt$lead_time)
## [1] 51.891
sd(filtered_lead_time_sqrt$lead_time)
## [1] 49.76219
IQR(filtered_lead_time_sqrt$lead_time)
## [1] 78
filtered_lead_time_segment <- filtered_lead_time_sqrt %>%
mutate(grouped_segment_type = fct_lump(market_segment_type, n = 2, other_level = "Others"))
# Calculate counts by the new grouping variable
segment_count <- table(filtered_lead_time_segment$grouped_segment_type)
# Create a bar chart of market segment proportions (pie)
ggplot(data.frame(segment_count), aes(x = "", y = segment_count, fill = names(segment_count))) +
geom_bar(stat = "identity", width = 0.8, alpha = 0.8, color = "black", position = position_dodge(width = 0.5)) +
geom_text(aes(label = paste0(segment_count)), size = 8, position = position_dodge(width = 0.8), vjust = 0.5, hjust = 0.5) +
labs(title = "Distribution of the reservations over Market Types",
fill = "Market Segment Type",
x = "Market Segment Type",
y = "Count of reservation using the Market Type") +
scale_fill_discrete(labels=c("Offline" = "Offline", "Online" = "Online", "Others" = "Others = Corporate, Aviation and Complementary" ))+
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
plot.title = element_text(hjust = 0.5),
legend.position = "bottom",
legend.direction = "horizontal",
legend.box = "horizontal",
legend.box.just = "center",
legend.title.align = 0.5,
legend.text.align = 0.5,
legend.margin = margin(5, 5, 5, 5))
## Don't know how to automatically pick scale for object of type <table>.
## Defaulting to continuous.
# Create the bar chart
ggplot(data.frame(segment_count), aes(x = "", y = segment_count, fill = names(segment_count))) +
geom_bar(stat = "identity", width = 1, alpha = 0.8, color = "black") +
geom_text(aes(label = paste0(round(segment_count/sum(segment_count)*100), "%")), position = position_stack(vjust = 0.5)) +
labs(title = "Distribution of Types",
fill = "Market Segment Type") +
theme_void() +
coord_polar(theta = "y") +
scale_fill_discrete(labels=c("Offline" = "Offline", "Online" = "Online", "Others" = "Others = Corporate, Aviation and Complementary" ))+
theme(plot.title = element_text(hjust = 0.5),
legend.position = "right",
legend.box = "horizontal",
legend.box.background = element_rect(color = "black", size = 0.5),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10))
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## Don't know how to automatically pick scale for object of type <table>.
## Defaulting to continuous.
ggplot(filtered_lead_time_segment, aes(x = lead_time, y = avg_price_per_room, fill = ..level..)) +
stat_density_2d(geom = "polygon") +
scale_fill_gradient(low = "pink", high = "#8B0000") +
labs(x = "Lead Time", y = "Average Price per Room", title = "Contour Plot of Lead Time and Average Price Per Room") +
theme_minimal() +
theme(plot.title = element_text(size = 18, hjust = 0.5, face = "bold", margin = margin(b = 10)),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14, face = "bold", margin = margin(t = 10)),
legend.position = "right",
legend.text = element_text(size = 12),
panel.grid.major = element_line(color = "#EAEAEA"),
panel.background = element_rect(fill = "#FDFDFD"))
cor(filtered_lead_time_segment$lead_time, filtered_lead_time_segment$avg_price_per_room)
## [1] 0.0009239012
# Filter lead time into percentage
top_lead_time <- filtered_lead_time_segment %>%
group_by(grouped_segment_type) %>%
summarise(lead_time = sum(lead_time)) %>%
mutate(percentage_lead_time = lead_time/sum(lead_time)*100) %>%
arrange(desc(percentage_lead_time))
ggplot(top_lead_time, aes(x = "", y = percentage_lead_time, fill = grouped_segment_type)) +
geom_bar(stat = "identity", width = 1, alpha = 0.8, color = "black") +
geom_text(aes(label = paste0(round(percentage_lead_time, 1), "%: ", grouped_segment_type)),
position = position_stack(vjust = 0.5),
color = "black", size = 5) +
labs(title = "Distribution of Market segment Types over lead time",
fill = "Market Segment Type",
x = "Market Segment Type",
y = "Lead Time taken by reservation in Percentage") +
theme_void() +
scale_fill_discrete(labels=c( "Others" = "Others = Corporate, Aviation and Complementary" , "Offline" = "Offline", "Online" = "Online"))+
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
plot.title = element_text(hjust = 0.5),
legend.position = "bottom",
legend.direction = "horizontal",
legend.box = "horizontal",
legend.box.just = "center",
legend.title.align = 0.5,
legend.text.align = 0.5,
legend.margin = margin(5, 5, 5, 5))
# Create the histogram plot
plot1 <- ggplot(Hotel_Reservations, aes(x = avg_price_per_room)) +
geom_histogram(binwidth = 1, color = "brown", fill = "lightgreen") +
labs(x = "Average Price per Room", y = "Frequency", title = "Distribution of Average Price per Room") +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
# Create the violin plot
plot2 <- ggplot(Hotel_Reservations, aes(x = 1, y = avg_price_per_room)) +
geom_violin(color = "brown", fill = "lightgreen") +
labs(x = "", y = "Average Price per Room", title = "Distribution of Average Price per Room (Violin Plot)")+
scale_x_continuous(breaks = NULL) +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
# Combine the two plots
plot1 + plot2
### Question 2,3 and 4 will be the same answers. Because, in our dataset
we only have 2 countinuous varaibles.
# Create a new data frame with the counts and proportions of meal plans
meal_plan_counts <- Hotel_Reservations %>%
count(type_of_meal_plan) %>%
mutate(proportion = n/sum(n))
# Create a bar chart of the meal plan proportions
ggplot(meal_plan_counts, aes(x = type_of_meal_plan, y = n, fill = type_of_meal_plan)) +
geom_col() +
labs(title = "Count of Meal Plan Types",
x = "Meal Plan Type",
y = "Count") +
scale_fill_brewer(palette = "Set2") +
theme_minimal()
ggplot(filtered_avg_price)+
geom_violin(aes(avg_price_per_room, as.factor(type_of_meal_plan), color = type_of_meal_plan, fill = type_of_meal_plan), alpha = 0.2)+
theme(axis.text.x = element_text())+
scale_x_continuous(breaks = scales::breaks_width(50))+
geom_vline(aes(xintercept=mean(lead_time)), color="black", linetype="dashed", size = 1)+
annotate(
"text",
x = 180, y = 3,
label = "The mean Price",
vjust = 1, size = 5, color = "black"
)+
annotate(
"curve",
x = 200, y = 2.5,
xend = 90, yend = 2.5,
arrow = arrow(length = unit(0.2, "cm"), type = "closed"),
color = "black"
)+
labs(
title = "Average Price per Room Vs. Meal Plan type",
x = "Average Price per Room",
y = "Meal Plan type"
)+
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
ggplot(Hotel_Reservations, aes(x = arrival_month)) +
geom_bar(alpha = 0.8, fill = "pink") +
geom_text(stat = "count", aes(label = ..count..), vjust = 0.5, angle = 360) +
labs(title = "Distribution of Arrival counts per month",
x = "Arrival Month",
y = "Arrivals count") +
theme_minimal() +
scale_x_continuous(breaks = seq(1, 12, by = 1)) +
theme(plot.title = element_text(hjust = 0.5),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10)) +
coord_flip()
# Create a table of words by agency ID
dates <- lapply(split(Hotel_Reservations$arrival_month, Hotel_Reservations$arrival_month), function(x) paste(x, collapse = " "))
names(dates) <- NULL
# Create word frequency table
word_freq <- table(unlist(strsplit(tolower(unlist(dates)), "\\W+")))
# Plot word cloud
wordcloud(names(word_freq), freq = word_freq, scale = c(12, 0.1), min.freq = 1, random.order = FALSE, colors = brewer.pal(8, "Dark2"),
max.words = 31, random.color = TRUE, rot.per = 0.3,
font = 3, family = "sans",
main = "Agency ID Word Cloud",
col.main = "steelblue")
title("Agency ID Word Cloud", cex.main = 1.5)
ggplot(filtered_lead_time_sqrt)+
geom_violin(aes(lead_time, as.factor(booking_status), color = booking_status, fill = booking_status), alpha = 0.2)+
theme(axis.text.x = element_text())+
scale_x_continuous(breaks = scales::breaks_width(50))+
geom_vline(aes(xintercept=mean(lead_time)), color="black", linetype="dashed", size = 1)+
annotate(
"text",
x = 250, y = 2.5,
label = "The mean Lead Time",
vjust = 1, size = 5, color = "black"
)+
annotate(
"curve",
x = 200, y = 2.5,
xend = 90, yend = 2.5,
arrow = arrow(length = unit(0.2, "cm"), type = "closed"),
color = "black"
)+
labs(
title = "Average Lead time Vs. Booking Status",
x = "Lead Time",
y = "Booking Status"
)+
theme_minimal()
plot_ly(filtered_avg_price, x = ~avg_price_per_room, y = ~lead_time, z = ~arrival_date, color = ~lead_time,
colors = c("#FDE725", "#B4DE2C", "#58A636", "#0C7C59", "#025949"),
type = "scatter3d", mode = "markers") %>%
layout(scene = list(xaxis = list(title = "Average Price per Room", showbackground = T, backgroundcolor = "rgb(230, 230,230)", gridcolor = "rgb(255, 255, 255)", zerolinecolor = "rgb(255, 255, 255)"),
yaxis = list(title = "Lead Time", showbackground = T, backgroundcolor = "rgb(230, 230,230)", gridcolor = "rgb(255, 255, 255)", zerolinecolor = "rgb(255, 255, 255)"),
zaxis = list(title = "Arrival Date", showbackground = T, backgroundcolor = "rgb(230, 230,230)", gridcolor = "rgb(255, 255, 255)", zerolinecolor = "rgb(255, 255, 255)")),
title = "Scatter Plot of Average Price, Lead Time, and Arrival Date")